---
title: "Data Replication"
author: "Digvijay Ghotane"
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
# Global chunk defaults: show the code, hide printed results, and silence
# messages and warnings. `error = FALSE` makes the knit stop at the first
# error instead of rendering it into the document.
knitr::opts_chunk$set(
  echo = TRUE,
  results = "hide",
  message = FALSE,
  error = FALSE,
  warning = FALSE
)
```

This file reproduces the dataset used for the statistical analysis.  

# Libraries  

```{r}
# Attach every dependency with library(): it raises an error when a package
# is missing, whereas require() only returns FALSE and lets a later chunk
# fail with an unrelated "could not find function" error.
# The attach order is kept unchanged because it determines which package
# wins when function names collide.
# NOTE(review): select() is always called as dplyr::select() below,
# presumably because a package attached after tidyverse masks it -- confirm
# before reordering.
library(tidyverse)
library(stargazer)
library(lubridate)
library(zoo)
library(mfx)
library(imputeTS)
```

# The Data  
## Dependent Variable  
### 1. Stone Pelting Data:  

```{r}
## Raw stone-pelting sources, all read from data/stone_pelt/.
## `khalid` is loaded here but not referenced again in the visible part of
## this file.
satp <- read.csv(
  file.path("data", "stone_pelt", "stone_pelting_data_satp_final.csv")
)
newspaper <- read.csv(
  file.path("data", "stone_pelt", "stone_pelting_newspapers_final.csv")
)
acled <- read.csv(file.path("data", "stone_pelt", "acled.csv"))
khalid <- read.csv(file.path("data", "stone_pelt", "khalid.csv"))

## ACLED
## Keep Jammu and Kashmir events whose notes mention stone pelting and
## collapse them to one row per (date, district) with the summed fatalities
## and the number of matching events.
acled <- acled %>%
  filter(admin1 == "Jammu and Kashmir") %>%
  dplyr::select(
    date = event_date,
    type = event_type,
    sub_type = sub_event_type,
    district = admin2,
    geo_precision,
    time_precision,
    notes,
    fatalities
  ) %>%
  mutate(date = lubridate::dmy(date)) %>%
  ## "stone" already matches "stones" and "stone pelt", so the four substring
  ## tests reduce to this single (case-sensitive) alternation.
  filter(str_detect(notes, "stone|pelt")) %>%
  mutate(event = 1) %>%
  group_by(date, district) %>%
  summarize(
    fatalities = sum(fatalities),
    acled_stone = sum(event)
  ) %>%
  ungroup() %>%
  mutate(
    month = as.integer(format(as.Date(date), "%m")),
    year = as.integer(format(as.Date(date), "%Y"))
  ) %>%
  ## (date, district) pairs are already unique after summarize(); this keeps
  ## the first row per pair as a safeguard.
  distinct(district, date, .keep_all = TRUE) %>%
  ## Rewrite district spellings; replacements are applied in the listed order.
  mutate(
    district = str_replace_all(
      district,
      c(
        "Shopian" = "Shupiyan",
        "Rajouri" = "Rajauri",
        "Budgam" = "Badgam",
        "Bandipora" = "Bandipur"
      )
    )
  )


## SATP
## The fourth column of the raw sheet is renamed by position.
names(satp)[4] <- "incidents"

## One row per (date, district): incidents are summed within each pair, rows
## without a district and the state-wide "Jammu & Kashmir" rows are dropped,
## and only the first row of each pair is kept.
satp <- satp %>%
  mutate(
    Date = as.Date(Date),
    year = as.numeric(format(Date, "%Y")),
    Place = na_if(Place, "") ## blank place -> NA
  ) %>%
  dplyr::select(
    date = Date,
    year,
    district = Place,
    incidents,
    NumberOfArrests = "No.of.Arrests"
  ) %>%
  mutate(
    date = as.Date(date),
    district = as.character(district)
  ) %>%
  dplyr::filter(!is.na(district)) %>%
  group_by(date, district) %>%
  mutate(incidents = sum(incidents)) %>%
  ungroup() %>%
  filter(district != "Jammu & Kashmir") %>%
  distinct(district, date, .keep_all = TRUE) %>%
  as.data.frame()
## Newspapers
## One row per (date, district) with `incident` fixed at 1; district
## spellings are rewritten to the forms used in the case_when() below.
newspaper <- newspaper %>%
  dplyr::mutate(
    date = as.Date(date, format = "%m/%d/%y"),
    year = as.numeric(format(date, "%Y")),
    ## Blank district values become NA so they can be filtered out below.
    district = na_if(as.character(District), "")
  ) %>%
  dplyr::select(date, year, incident = stone_pelt_incident, district) %>%
  dplyr::filter(!is.na(district)) %>%
  mutate(
    district = case_when(
      district == "Anantnag " ~ "Anantnag", ## trailing space in the raw data
      district %in% c("Bandipora", "Bandipore") ~ "Bandipur",
      district == "Budgam" ~ "Badgam",
      district == "Rajouri" ~ "Rajauri",
      TRUE ~ district
    ),
    ## Every remaining row counts as one incident, whatever the raw value was.
    incident = 1
  ) %>%
  distinct(district, date, .keep_all = TRUE) %>%
  as_tibble()
```

## Control Variables  

### 2. Climate Data:  

```{r}
## Daily climate controls per district, with descriptive column names.
climate <- read.csv("data/climate/climate_data.csv") %>%
  mutate(
    date = as.Date(date_time),
    district = as.character(district)
  ) %>%
  dplyr::select(
    date,
    district,
    AmountOfSnowInCM = totalSnow_cm,
    AmountOfPrecipitationInMM = precipMM,
    TemperatureInC = tempC,
    LengthOfDaylight = sunHour
  ) %>%
  mutate(
    ## Align the spelling with the other datasets.
    district = if_else(district == "Baramula", "Baramulla", district),
    ## sunHour is multiplied by 60 (presumably hours -> minutes; confirm
    ## against the data source).
    LengthOfDaylight = LengthOfDaylight * 60
  )
```


### 3. Onion Prices Data:  

```{r}
## Daily modal onion price per district.
## Zero prices are discarded, multiple quotes for the same (date, district)
## are replaced by their mean, and one row per pair is kept.
## The result is returned ungrouped: the previous version left `onion`
## grouped by district and then overwrote that grouping column with `$<-`,
## which leaks grouping state into every later chunk that touches `onion`.
onion <- read.csv("data/onion/onionprice.csv") %>%
  mutate(date = lubridate::dmy(date)) %>%
  dplyr::select(
    date,
    district,
    onion_price = "Modal.Price..Rs..Quintal."
  ) %>%
  filter(onion_price != 0) %>%
  group_by(date, district) %>%
  mutate(onion_price = mean(onion_price)) %>%
  ungroup() %>%
  ## Keeps the first row of each (district, date) pair without regrouping.
  distinct(district, date, .keep_all = TRUE) %>%
  ## Align the spelling with the other datasets.
  mutate(district = gsub("Rajouri", "Rajauri", district))
```

```{r}
## Adjusting for inflation, baseline = 2012
## Price index used for each calendar year (2012 = 100). Prices dated in a
## year that is not listed are left unchanged.
## NOTE(review): the 2010 and 2011 indices are above the 2012 base, unlike
## every later year -- presumably taken from a different series; confirm.
## Those years fall outside the 2013-08-01..2017-12-31 window that the
## analysis later subsets to.
cpi_by_year <- c(
  "2020" = 138.8,
  "2019" = 134.8,
  "2018" = 129.7,
  "2017" = 124.9,
  "2016" = 120.9,
  "2015" = 118.4,
  "2014" = 114.6,
  "2013" = 107.9,
  "2012" = 100,
  "2011" = 146.5,
  "2010" = 135.1
)

## Rescales each price by the index of its own year: (price / index) * 100.
## A missing year yields a missing price, as ifelse() propagates the NA test.
deflate_to_2012 <- function(price, price_year, index_by_year) {
  for (yr in names(index_by_year)) {
    price <- ifelse(
      price_year == as.integer(yr),
      (price / index_by_year[[yr]]) * 100,
      price
    )
  }
  price
}

onion <- onion %>%
  mutate(
    onion_price = deflate_to_2012(
      onion_price,
      as.integer(format(as.Date(date), "%Y")),
      cpi_by_year
    )
  )
```

## Demographics Data
```{r message = F}
## District-level demographics; `District` is renamed to `district` so it can
## serve as the join key.
demo <- read_csv("data/demographics/demo.csv") %>%
  dplyr::select(
    district = District,
    MajorityReligion,
    UrbanVSRural,
    PopulationGroup,
    PopulationDensity,
    PercentageUrbanPopulation
  )
```

## Joining the Newspaper, SATP, and ACLED Datasets  
```{r}
## Combine the three incident sources on (date, district). Full joins keep a
## row for every pair that appears in at least one source; columns from a
## source that did not report that pair are NA.
final <- newspaper %>%
  full_join(satp, by = c("date", "district")) %>% ## Newspaper + SATP
  full_join(acled, by = c("date", "district"))    ## + ACLED

```

```{r}
## Stone pelting indicator: 1 when at least one of the three sources
## (Newspapers -> `incident`, SATP -> `incidents`, ACLED -> `acled_stone`)
## has a record for that district-day, 0 otherwise.
final <- final %>%
  mutate(
    stone = as.numeric(
      !is.na(incident) | !is.na(incidents) | !is.na(acled_stone)
    )
  )

## Join onto the climate data and keep only the analysis columns.
final <- climate %>%
  full_join(final, by = c("date", "district")) %>%
  dplyr::select(
    date,                      ## Date, "%Y-%m-%d"
    district,                  ## District
    LengthOfDaylight,          ## Daylight time
    AmountOfSnowInCM,          ## Snow in cm
    AmountOfPrecipitationInMM, ## Precipitation in mm
    TemperatureInC,            ## Temperature in Celsius
    NumberOfArrests,           ## No. of arrests
    stone                      ## Dichotomous variable
  ) %>%
  ## Assumption: a district-day that appears only in the climate data (so
  ## `stone` is NA after the join) had no stone pelting incident.
  mutate(stone = coalesce(stone, 0))
```


```{r}
## Ramzan dates taken from https://www.calendardate.com/ramadan_201x.htm
ramzan_days <- c(
  seq(as.Date("2010-08-11"), as.Date("2010-09-09"), "days"), ## 2010
  seq(as.Date("2011-08-01"), as.Date("2011-08-30"), "days"), ## 2011
  seq(as.Date("2012-07-20"), as.Date("2012-08-18"), "days"), ## 2012
  seq(as.Date("2013-07-09"), as.Date("2013-08-07"), "days"), ## 2013
  seq(as.Date("2014-06-29"), as.Date("2014-07-28"), "days"), ## 2014
  seq(as.Date("2015-06-18"), as.Date("2015-07-17"), "days"), ## 2015
  seq(as.Date("2016-06-07"), as.Date("2016-07-05"), "days"), ## 2016
  seq(as.Date("2017-05-27"), as.Date("2017-06-24"), "days"), ## 2017
  seq(as.Date("2018-05-16"), as.Date("2018-06-14"), "days"), ## 2018
  seq(as.Date("2019-05-06"), as.Date("2019-06-04"), "days")  ## 2019
)

final <- final %>%
  ungroup() %>%
  ## 1 on days inside a Ramzan window, 0 otherwise.
  mutate(RamzanDummy1IsRamzan = ifelse(date %in% ramzan_days, 1, 0)) %>%
  ## Subsetting the dataset from 2013 August to December 2017
  filter(
    date >= as.Date("2013-08-01"),
    date <= as.Date("2017-12-31")
  ) %>%
  ## Joining Onion prices
  left_join(onion, by = c("date", "district")) %>%
  mutate(
    ## Adding Month & Year column
    month = as.integer(format(as.Date(date), "%m")),
    year = as.integer(format(as.Date(date), "%Y")),
    ## Adding Friday. The weekday is taken as a number (week_start = 1 makes
    ## Monday 1 and Friday 5) rather than compared with the string "Friday":
    ## weekdays() returns localized day names, so the string comparison
    ## silently produced an all-zero dummy under a non-English locale.
    Friday = ifelse(lubridate::wday(date, week_start = 1) == 5, 1, 0),
    ## Adding demonetization and killing of Burhan Wani: 1 from the given
    ## date onwards, 0 before it.
    Demonetization = ifelse(date >= as.Date("2016-11-09"), 1, 0),
    KillingBW = ifelse(date >= as.Date("2016-07-09"), 1, 0)
  ) %>%
  ## Joining demographic data
  left_join(demo, by = c("district"))

```
```{r}
## Remove the Reasi district. Rows whose district is NA are removed as well,
## because the comparison evaluates to NA for them.
final <- filter(final, district != "Reasi")
```

## Interpolating Onion Prices by Averaging the Values Surrounding the NAs  
### Districts with available onion price data
```{r}
## Rows for the nine districts listed here, which are interpolated district
## by district in the next chunk.
final1 <- final %>%
  filter(
    district %in% c(
      "Anantnag",
      "Badgam",
      "Baramulla",
      "Jammu",
      "Kathua",
      "Pulwama",
      "Rajauri",
      "Srinagar",
      "Udhampur"
    )
  )
```

```{r}
## Fill missing onion prices within each district's date-ordered series.
## Each price becomes the mean of two filled series: one carrying the last
## observation forward, and one carrying the next observation backward
## (obtained by running na_locf() on the reversed series). Observed prices
## are unchanged, and a gap between two observations receives their average.
## NOTE(review): gaps at the very start or end of a series depend on how
## na_locf() treats NAs it cannot fill in its main direction -- confirm
## against the imputeTS documentation.
## ungroup() is added so the per-district grouping does not leak into the
## following chunks.
final1 <- final1 %>%
  arrange(district, date) %>%
  group_by(district) %>%
  mutate(
    onion_price =
      (na_locf(onion_price) + rev(na_locf(rev(onion_price)))) / 2
  ) %>%
  ungroup()
```

### Districts with no data
```{r}
## Rows for every district other than the nine listed here. Rows with an NA
## district are excluded explicitly, matching the NA-dropping behaviour of
## a chain of `!=` comparisons.
final2 <- final %>%
  filter(
    !is.na(district),
    !(district %in% c(
      "Anantnag",
      "Badgam",
      "Baramulla",
      "Jammu",
      "Kathua",
      "Pulwama",
      "Rajauri",
      "Srinagar",
      "Udhampur"
    ))
  )
```

```{r}
## Stack the two district subsets back into a single table.
final3 <- bind_rows(final1, final2)
```

```{r}
## Fill the remaining missing onion prices across districts on the same day.
## Rows are ordered by district and then grouped by date, so within one day
## each missing price becomes the mean of the nearest available price in the
## preceding and in the following district of that ordering.
## NOTE(review): this relies on every date having at least one non-missing
## price; confirm how na_locf() behaves on an all-NA group.
## ungroup() is added so the saved dataset does not stay grouped by date.
final3 <- final3 %>%
  arrange(district, date) %>%
  group_by(date) %>%
  mutate(
    onion_price =
      (na_locf(onion_price) + rev(na_locf(rev(onion_price)))) / 2
  ) %>%
  ungroup()
```

```{r}
## The interpolated table becomes the final dataset.
final <- final3
```

```{r}
## Preview the first six rows of the final dataset.
head(final, n = 6L)
```

```{r}
## Write the final dataset to disk as CSV.
write_csv(final, "data/output_data/final_onion_fixed.csv")
```

